Curious what congress and Trump are saying about the coronavirus?

This analysis takes a look at the number of tweets, their positive and negative sentiment, and their content, for each party, and how these have evolved over time since February 1, 2020.

I utilize open data hosted online. In particular, big thanks to Alex Litel who created the Tweets of Congress repo, where I pulled congressional tweets from, and the folks running Trump Twitter Archive, where I pulled Trump’s tweets from.

The repo for this project is: https://github.com/dcosme/congress-tweets-covid19

prep data

load packages

library(tidyverse)
library(jsonlite)
library(tidytext)
library(ggwordcloud)
library(knitr)

define palettes

# Color palettes drawn from the Wes Anderson "Zissou1" scheme.
palette4 <- wesanderson::wes_palette("Zissou1", 4, "continuous")
# swap the last two colors
palette4 <- palette4[c(1, 2, 4, 3)]
palette5 <- wesanderson::wes_palette("Zissou1", 5, "continuous")
palette2 <- palette5[c(2, 4)]

load congress twitter handles

# Twitter handles for 116th Congress members, one row per member.
# The four handle-source columns (ODU.WSDL, CSPAN, TweetCongress, Github)
# are reshaped long, cleaned, then spread back wide.
congress_twitter = read.csv("~/Documents/code/US-Congress/116thCongress/116Congress.csv", stringsAsFactors = FALSE) %>%
  rename("name" = Wikipedia..Names) %>%
  # long format: one row per (member, handle source)
  gather(handle_type, twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github) %>%
  select(name, handle_type, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         twitter_handle = ifelse(twitter_handle == "", NA, twitter_handle),
         # repair latin1 escape artifacts (e.g. "<e9>" -> "é") first...
         name = gsub("<e9>", "é", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<e1>", "á", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<fa>", "ú", `Encoding<-`(name, "latin1"), fixed = TRUE),
         name = gsub("<ed>", "í", `Encoding<-`(name, "latin1"), fixed = TRUE),
         # ...then strip accents so names match the ASCII roster files
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # first initial + last name: the join key used with the roster table
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE) %>%
  spread(handle_type, twitter_handle)

# Senate + House rosters combined into one member table with state,
# party, demographics, and each member's primary Twitter handle.
congress = read.csv("~/Documents/code/us-senate/us-senate/data/us-senate.csv", stringsAsFactors = FALSE) %>%
  bind_rows(read.csv("~/Documents/code/us-house/us-house/data/us-house.csv", stringsAsFactors = FALSE)) %>%
  select(state_name, title, party, name, gender, ethnicity, twitter_handle) %>%
  mutate(twitter_handle = tolower(twitter_handle),
         # blank or known-bad handles become NA; a single %in% test
         # replaces the original three-level nested ifelse chain
         # (NA inputs stay NA under both forms)
         twitter_handle = ifelse(twitter_handle %in%
                                   c("", "housedemocrats", "senatorloeffler?lang=en"),
                                 NA, twitter_handle),
         # strip accents so names match the handle file
         name = gsub("é", "e", name),
         name = gsub("á", "a", name),
         name = gsub("ú", "u", name),
         name = gsub("í", "i", name),
         name = trimws(name)) %>%
  # first initial + last name: the join key used with congress_twitter
  extract(name, c("first", "last"), "([A-Za-z]{1}).* (.*)", remove = FALSE)

# Merge roster info with the handle table on (first initial, last name),
# stack all handle columns into one, and fill missing member attributes
# within each member before dropping duplicate rows.
congress_info = full_join(congress, congress_twitter, by = c("first", "last")) %>%
  # one row per (member, handle source) across both tables
  gather(handle_type, twitter_handle, twitter_handle, ODU.WSDL, CSPAN, TweetCongress, Github) %>%
  select(state_name, title, party, first, last, gender, ethnicity, twitter_handle) %>%
  group_by(first, last) %>%
  # propagate known attributes up and down within each member's rows
  fill(state_name, title, party, gender, ethnicity, twitter_handle, .direction = "updown") %>%
  unique() %>%
  # rows with no state_name came only from the handle file with no roster match
  filter(!is.na(state_name)) %>%
  ungroup() %>%
  mutate(last = tolower(last))

load congressional tweets

pull to update repo

cd ~/Documents/code/congresstweets
git pull origin master
## From https://github.com/alexlitel/congresstweets
##  * branch            master     -> FETCH_HEAD
## Already up to date.

load the files

file_dir = "~/Documents/code/congresstweets/data"
# daily JSON dumps for Feb-Mar 2020; escape the dot and anchor the
# extension so stray files (e.g. "2020-02-01.json.bak") are not picked up
file_pattern = "2020-0[2-3]-.*\\.json$"
file_list = list.files(file_dir, pattern = file_pattern)

# Read one day's dump; on a parse failure report the file and return
# NULL so that day is skipped instead of aborting the whole load.
read_tweet_file = function(path) {
  tryCatch(
    jsonlite::stream_in(file(path), verbose = FALSE),
    error = function(e) {
      message(path)
      NULL
    }
  )
}

# bind_rows over a list replaces rbind() inside a loop (which copies the
# accumulated frame on every iteration, O(n^2)) and also tolerates days
# whose JSON carries a slightly different column set.
tweets_temp = bind_rows(lapply(file.path(file_dir, file_list), read_tweet_file))

# Keep only handle/time/text, lowercase both text fields, and restrict
# to tweets that mention the virus.
tweets_all = tweets_temp %>%
  rename("twitter_handle" = screen_name) %>%
  select(twitter_handle, time, text) %>%
  mutate(twitter_handle = tolower(twitter_handle)) %>%
  mutate(text = tolower(text)) %>%
  filter(grepl("corona|virus|covid|flu", text))

find missing congressional twitter handles

# Handles that tweeted about the virus but matched no roster row
# (no last name after the join).
missing = tweets_all %>%
  left_join(congress_info) %>%
  filter(is.na(last)) %>%
  distinct(twitter_handle)

# Unique lowercase last names, used to search inside unmatched handles.
last_names = congress_info %>%
  ungroup() %>%
  distinct(last) %>%
  mutate(last = tolower(last))

# For each unmatched handle, collect every congressional last name that
# appears inside the handle string. lapply (not sapply) guarantees a
# list column: sapply would silently simplify the result to a character
# vector whenever every handle matched exactly one name, breaking the
# unnest() call downstream. vapply pins the per-name grepl result type.
missing$last <- lapply(missing$twitter_handle, function(handle) {
  last_names$last[vapply(last_names$last, grepl, logical(1), handle)]
})

# One row per (handle, candidate last name); handles with no candidate
# keep a blank last name. Written out for manual correction.
missing %>%
  unnest(last, keep_empty = TRUE) %>%
  mutate(last = ifelse(is.na(last), "", last),
         first = toupper(substring(twitter_handle, 1, 1))) %>%
  write.csv("missing.csv", row.names = FALSE)

# Hand-corrected handle/name matches (edited offline from missing.csv).
missing_edited = read.csv("missing_edited.csv", stringsAsFactors = FALSE)

# Final member/handle lookup: roster info plus the hand-matched handles,
# one row per (member, handle).
congress_full = congress_info %>%
  full_join(., missing_edited, by = c("first", "last")) %>%
  # stack the handle columns from both sources into a single column
  gather(var, twitter_handle, contains("twitter_handle")) %>%
  select(-var) %>%
  unique() %>%
  filter(!is.na(twitter_handle)) %>%
  # prefer the hand-edited party label (party.y) when it is non-blank
  mutate(party = ifelse(party.y == "" | is.na(party.y), party.x, party.y)) %>%
  select(-c(party.x, party.y)) %>%
  unique() %>%
  filter(!is.na(party))

# Attach member info to each virus-related tweet and add date, week,
# and month columns for aggregation.
congress_tweets = tweets_all %>%
  left_join(congress_full) %>%
  mutate(time = lubridate::as_date(time)) %>%
  mutate(week = lubridate::floor_date(time, "week"),
         month = lubridate::floor_date(time, "month"))

# sanity check: handles that still have no party after the join
# congress_tweets %>%
#   filter(is.na(party)) %>%
#   select(twitter_handle) %>%
#   unique()

load trump tweets

## Download one year of tweets from the Trump Twitter Archive.
##
## year     - archive year to fetch (e.g. 2020)
## fromJSON - if TRUE (default) parse the body into a data frame;
##            if FALSE return the raw httr response object
## handle   - account path segment in the archive URL; defaults to
##            "realdonaldtrump" (previously hard-coded), so existing
##            callers are unaffected
##
## Returns a data frame of tweets, an empty data frame when the archive
## answers with an HTML page instead of JSON, or (fromJSON = FALSE) the
## httr response object.
get_tweets = function(year, fromJSON = TRUE, handle = "realdonaldtrump") {
  ## build and send request
  url <- paste0(
    "http://trumptwitterarchive.com/",
    "data/", handle, "/",
    year,
    ".json"
  )
  ## response object
  r <- httr::GET(url)
  ## warn (don't abort) on a non-2xx status
  httr::warn_for_status(r)
  ## if fromJSON then convert to list otherwise return response object
  if (fromJSON) {
    r <- httr::content(r, "text")
    ## the archive serves an HTML page for missing years; detect it and
    ## return an empty data frame instead of failing inside fromJSON
    if (grepl("^\\<\\!DOCTYPE", r)) {
      r <- data.frame()
    } else {
      r <- jsonlite::fromJSON(r)
    }
  }
  r
}

# Trump's 2020 tweets, restricted to Feb-Mar and virus-related text,
# shaped to the same columns as the congressional tweets.
trump_tweets = get_tweets(year = 2020) %>%
  mutate(twitter_handle = "realdonaldtrump",
         party = "trump",
         text = tolower(text),
         # archive timestamps look like "Wed Mar 04 18:22:01 +0000 2020"
         time = lubridate::as_date(as.POSIXct(created_at, format = "%a %b %d %H:%M:%S %z %Y")),
         week = lubridate::floor_date(time, "week"),
         month = lubridate::floor_date(time, "month")) %>%
  select(twitter_handle, text, time, week, month, party) %>%
  filter(grepl("2020-02|2020-03", month)) %>%
  filter(grepl("corona|virus|covid|flu", text))

merge tweets

# Congressional and presidential tweets in one frame.
tweets = bind_rows(congress_tweets, trump_tweets)

# Regex of word "roots" dropped from wordclouds/sentiment: URL fragments,
# media markers, numbers, and the virus names themselves.
# NOTE(review): the dots in ".com" and "t.co" are unescaped, so they match
# any character (e.g. "become", "taco") — confirm whether that extra
# filtering is intended before tightening the pattern.
ignore_root_words = "http|.com|img|jpg|video|live|index|[0-9]|corona|covid|vid|ncov|aspx|utm|t.co|png"
# Exact tokens to drop (retweet/quote markers and tokenization artifacts).
ignore_words = c("rt", "amp", "qt", "pu", "tag", "i'm", "it's", "i’m", "it’s", "lr", "li", "ag")

define wordcloud function

# Build a faceted wordcloud of the most frequent words per party.
#
# data       - tweet data frame with at least text, party, and time columns
# party      - optional single party to keep (e.g. "democrat")
# start_date - optional first date to include ("YYYY-MM-DD")
# duration   - optional number of days after start_date to include
# n_words    - number of top words plotted per party
# n_colors   - number of color bins for word frequency
# size       - maximum word size in the cloud
#
# Returns a list with `plot` (the ggplot object) and `data` (the
# date/party-filtered tweets). Note the democrat/republican/trump filter
# below applies only inside the plot pipeline, not to the returned data.
plot_tweets = function(data, party=NULL, start_date=NULL, duration=NULL, n_words=50, n_colors=6, size=20) {
  
  data = data %>%
      mutate(time = lubridate::as_date(time))
  
  # optional party filter; !!party injects the argument's value so it is
  # not confused with the `party` column inside filter()
  if (!is.null(party)) {
    data = data %>%
      filter(party == !!party)
  }
  
  # optional date window: [start_date, start_date + duration] when a
  # duration is given, otherwise everything from start_date onward
  if (!is.null(start_date)) {
    if (!is.null(duration)) {
      data = data %>%
        filter(time >= start_date & time <= lubridate::date(start_date) + lubridate::days(duration))
    } else {
      data = data %>%
        filter(time >= start_date)
    }
  }
  
  palette = wesanderson::wes_palette("Zissou1", n_colors, "continuous")
  
  # fixed seed so the wordcloud layout is reproducible across renders
  # (NOTE: this also resets the caller's RNG state)
  set.seed(42)
  
plot = data %>%
    filter(party %in% c("democrat", "republican", "trump")) %>%
    select(text, party) %>%
    unnest_tokens(word, text) %>%
    group_by(party) %>%
    count(word, sort = TRUE) %>%
    anti_join(stop_words, by = "word") %>%
    filter(!grepl(ignore_root_words, word)) %>%
    filter(!word %in% ignore_words) %>%
    # data is grouped by party, so this keeps the top n_words per party
    slice(1:n_words) %>%
    mutate(word = gsub("\\.", "", word),
           sum = sum(n),
           # size = share of this party's top-word counts
           size = n / sum,
           tile = ntile(n, n_colors)) %>%
    ggplot(aes(label = word, size = size, color = as.factor(tile))) +
    geom_text_wordcloud_area(shape = "square") +
      scale_size_area(max_size = size, trans = power_trans(1/.7)) +
      scale_color_manual(values = palette) +
      facet_wrap(~party) +
      theme_minimal() +
      theme(strip.text.x = element_text(size = 12))
  
  return(list(plot = plot, data = data))
}

number of tweets

How many times have Congress and the President tweeted about COVID-19?

daily

# Daily tweet counts, stacked by party.
tweets %>%
  filter(!is.na(party)) %>%
  ggplot() +
  geom_bar(aes(time, fill = party)) +  # default stat counts rows per day
  scale_fill_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

# Daily tweet counts as one line per party.
tweets %>%
  filter(!is.na(party)) %>%
  ggplot() +
  geom_line(aes(time, color = party), stat = "count") +
  scale_color_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

weekly

# Weekly tweet counts, stacked by party.
tweets %>%
  filter(!is.na(party)) %>%
  ggplot() +
  geom_bar(aes(week, fill = party)) +  # default stat counts rows per week
  scale_fill_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

# Weekly tweet counts as one line per party.
tweets %>%
  filter(!is.na(party)) %>%
  ggplot() +
  geom_line(aes(week, color = party), stat = "count") +
  scale_color_manual(name = "", values = palette4) +
  labs(x = "", y = "number of tweets\n") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

sentiment of tweets

How positive and negative is the content of the tweets?

Here is a list of the top 20 positive or negative words for each party and the President.

overall

# Top 20 positive/negative (Bing lexicon) words per party, overall.
sentiments = tweets %>%
  unnest_tokens(word, text) %>%
  # explicit `by` silences the "Joining, by = ..." message
  inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl(ignore_root_words, word)) %>%
  filter(!word %in% ignore_words) %>%
  # "trump" is both the President's name and a Bing sentiment word; drop it
  filter(word != "trump") %>%
  group_by(party) %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  # slice_max replaces the superseded top_n(); ties are kept, as before
  slice_max(n, n = 20) %>%
  group_by(party) %>%
  arrange(n) %>%
  mutate(order = row_number())

# Horizontal bars of the top sentiment words, ordered within each
# party facet.
sentiments %>%
  ggplot(aes(x = drlib::reorder_within(word, n, party),
             y = n,
             fill = sentiment)) +
  geom_col() +
  coord_flip() +
  drlib::scale_x_reordered() +
  facet_wrap(~party, scales = "free") +
  scale_fill_manual(name = "", values = palette2) +
  labs(x = NULL, y = "\nnumber of times tweeted") +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

by month

# Top 20 positive/negative (Bing lexicon) words per party, by month.
sentiments = tweets %>%
  unnest_tokens(word, text) %>%
  # explicit `by` silences the "Joining, by = ..." message
  inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
  anti_join(stop_words, by = "word") %>%
  filter(!grepl(ignore_root_words, word)) %>%
  filter(!word %in% ignore_words) %>%
  # "trump" is both the President's name and a Bing sentiment word; drop it
  filter(word != "trump") %>%
  # month is a floored Date; label the two months present in the data
  mutate(month = ifelse(month == "2020-02-01", "february", "march")) %>%
  group_by(party, month) %>%
  count(word, sentiment, sort = TRUE) %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  # slice_max replaces the superseded top_n(); ties are kept, as before
  slice_max(n, n = 20) %>%
  group_by(party, month) %>%
  arrange(n) %>%
  mutate(order = row_number())

# reorder_within expects a grouping vector aligned row-by-row with
# `word`; the original passed the literal strings c("party", "month"),
# which were recycled across rows instead of identifying each facet.
# paste(party, month) builds the correct per-row facet key so words are
# ordered within their own (month, party) panel.
sentiments %>%
  ggplot(aes(drlib::reorder_within(word, n, paste(party, month)), n, fill = sentiment)) +
  geom_col() +
  drlib::scale_x_reordered() +
  facet_wrap(month~party, scales = "free") +
  labs(y = "\nnumber of times tweeted",
       x = NULL) +
  coord_flip() +
  scale_fill_manual(name = "", values = palette2) +
  theme_minimal(base_size = 14) +
  theme(legend.position = "top")

content of tweets

What are Congress and the President saying about COVID-19?

Here are the 100 most frequently used words by each party and the President in February and March.

by month

February

# February wordcloud: top 100 words per party.
p = plot_tweets(tweets, start_date = "2020-02-01", duration = 28, n_words = 100)
p$plot

# tweet counts per party for the same window
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 1891
republican 1106
trump 24

March

# March wordcloud: top 100 words per party.
p = plot_tweets(tweets, start_date = "2020-03-01", duration = 30, n_words = 100)
p$plot

# tweet counts per party for the same window
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 7820
republican 6992
trump 83

by week

Here are the 50 most frequently used words by each party and the President for each week in February and March.

Feb 1 - Feb 7

# Week of Feb 1 - Feb 7.
p = plot_tweets(tweets, start_date = "2020-02-01", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 198
republican 133
trump 2

Feb 8 - Feb 14

# Week of Feb 8 - Feb 14.
p = plot_tweets(tweets, start_date = "2020-02-08", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 175
republican 162
trump 1

Feb 15 - Feb 21

# Week of Feb 15 - Feb 21.
p = plot_tweets(tweets, start_date = "2020-02-15", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 103
republican 80
trump 1

Feb 22 - Feb 28

# Week of Feb 22 - Feb 28.
p = plot_tweets(tweets, start_date = "2020-02-22", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 1200
republican 677
trump 19

Feb 29 - Mar 6

# Week of Feb 29 - Mar 6.
p = plot_tweets(tweets, start_date = "2020-02-29", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 2557
republican 2795
trump 24

Mar 7 - Mar 13

# Week of Mar 7 - Mar 13.
p = plot_tweets(tweets, start_date = "2020-03-07", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 5315
republican 4132
trump 54

Mar 14 - Mar 20

# Week of Mar 14 - Mar 20.
p = plot_tweets(tweets, start_date = "2020-03-14", duration = 6)
p$plot

# tweet counts per party for the same week
p$data %>%
  filter(party %in% c("democrat", "republican", "trump")) %>%
  count(party, name = "number of tweets") %>%
  kable(format = "pandoc")
party number of tweets
democrat 163
republican 119
trump 6